/*==============================================================================
FILE NAME: Create_region_complaints.do
INPUTS: incidents.dta
OUTPUTS: region complaints.dta, region air complaints.dta (also writes scratch file temp.dta to processed_data)
CREATED: 6 January 2022
UPDATED: 7 August 2024
==============================================================================*/

// Create TCEQ region-by-month panel of complaints
set more off

/* Set directory if working independently through code.
   Fill in your username and root path below; when the do-file is called from
   a script that already defines $processed_data, this block is skipped. */
if c(username)=="" { //insert username
    global rootdir "" // insert root path
    global processed_data "$rootdir/processed_data"  // Define global paths for replication package
}

// Every path below is built from $processed_data. Stop here with a clear
// message if it was never defined, instead of failing later with a
// misleading "file /incidents.dta not found".
if `"$processed_data"' == "" {
    display as error "global processed_data is not set: define it (or fill in the username/rootdir block) before running this do-file"
    exit 198
}

// ---- All complaints: load incident records and keep the analysis sample ----
use "$processed_data/incidents.dta", clear
keep ComplaintIncident TCEQRegion IncidentRecDate IncidentStatus
duplicates drop
// After de-duplication each row must be a distinct complaint; isid aborts otherwise
isid ComplaintIncident

// Parse the received date (month-day-year text) into a Stata daily date.
// Negative values, i.e. days before 01jan1960, are set to missing.
generate rec_date = date(IncidentRecDate, "MDY")
replace rec_date = . if rec_date < 0
format rec_date %td

// Calendar month in which the complaint was received
generate mdate = mofd(rec_date)
format mdate %tm

// Keep complaints received January 2003 through December 2019
// (rows with a missing month are outside the range and are dropped too)
keep if inrange(mdate, tm(2003m1), tm(2019m12))

// Show the status distribution, then exclude complaints referred outside TCEQ
// (author's note: 15% of complaints are referred)
tabulate IncidentStatus
keep if IncidentStatus != "REFERRED"

// Only region, month and a unit counter are needed from here on
keep TCEQRegion mdate
generate complaint = 1
sort TCEQRegion

// Map each TCEQRegion value to a consecutive integer id
egen region_id = group(TCEQRegion)
label variable region_id "numeric identifier for TCEQRegion"
drop TCEQRegion

// Count complaints within each region-month cell
collapse (sum) region_complaint = complaint, by(region_id mdate)
label variable region_complaint "# complaints in region-month"

// Declare the panel, then add every region-month that had no complaints
// and record a count of zero for it
sort region_id mdate
xtset region_id mdate
tsfill, full
replace region_complaint = 0 if missing(region_complaint)
// Report whether the filled panel is strongly balanced
spbalance

// Write the balanced region-month panel to the scratch file
save "$processed_data/temp.dta", replace

// Complaints received in the same month in all OTHER regions.
// The monthly total across regions is computed in memory with egen instead of
// collapsing to month level and merging back from disk: there is no merge
// whose result could go unchecked, and the xtset declaration that spbalance
// requires is left untouched.
bysort mdate: egen double total_complaint = total(region_complaint)
generate diff_region_complaint = total_complaint - region_complaint
label variable diff_region_complaint "# of complaints in month in different regions"
drop total_complaint

// Column order: month, region, own count, other-region count
// (the layout the collapse-and-merge steps used to produce)
order mdate region_id region_complaint diff_region_complaint
sort region_id mdate

// Inspect the distributions. spbalance without options only reports whether
// the panel is strongly balanced; it does not modify the data.
summarize region_complaint diff_region_complaint, detail
spbalance

// Save final region complaints dataset
save "$processed_data/region complaints.dta", replace

// ---- Air complaints: load incident records and keep the analysis sample ----
use "$processed_data/incidents.dta", clear
keep if Media=="AIR"
keep ComplaintIncident TCEQRegion IncidentRecDate IncidentStatus
duplicates drop
// After de-duplication each row must be a distinct complaint; isid aborts otherwise
isid ComplaintIncident

// Parse the received date (month-day-year text) into a Stata daily date.
// Negative values, i.e. days before 01jan1960, are set to missing.
generate rec_date = date(IncidentRecDate, "MDY")
replace rec_date = . if rec_date < 0
format rec_date %td

// Calendar month in which the complaint was received
generate mdate = mofd(rec_date)
format mdate %tm

// Keep complaints received January 2003 through December 2019
// (rows with a missing month are outside the range and are dropped too)
keep if inrange(mdate, tm(2003m1), tm(2019m12))

// Show the status distribution, then exclude complaints referred outside TCEQ
// (author's note: almost 6% of air complaints are referred)
tabulate IncidentStatus
keep if IncidentStatus != "REFERRED"

// Only region, month and a unit counter are needed from here on
keep TCEQRegion mdate
generate air_complaint = 1
sort TCEQRegion

// Map each TCEQRegion value to a consecutive integer id.
// NOTE(review): the ids are numbered from the regions present in this
// air-only subsample. They line up with region_id in "region complaints.dta"
// only if the same set of regions survives in both samples -- confirm before
// joining the two files on region_id.
egen region_id = group(TCEQRegion)
label variable region_id "numeric identifier for TCEQRegion"
drop TCEQRegion

// Count air complaints within each region-month cell
collapse (sum) region_air_complaint = air_complaint, by(region_id mdate)
label variable region_air_complaint "# air complaints in region-month"

// Declare the panel, then add every region-month that had no air complaints
// and record a count of zero for it
sort region_id mdate
xtset region_id mdate
tsfill, full
replace region_air_complaint = 0 if missing(region_air_complaint)
// Report whether the filled panel is strongly balanced
spbalance

// Write the balanced region-month panel to the scratch file
save "$processed_data/temp.dta", replace

// Air complaints received in the same month in all OTHER regions.
// The monthly total across regions is computed in memory with egen instead of
// collapsing to month level and merging back from disk: there is no merge
// whose result could go unchecked, and the xtset declaration that spbalance
// requires is left untouched.
bysort mdate: egen double total_air_complaint = total(region_air_complaint)
generate diff_region_air_complaint = total_air_complaint - region_air_complaint
label variable diff_region_air_complaint "# of air complaints in month in different regions"
drop total_air_complaint

// Column order: month, region, own count, other-region count
// (the layout the collapse-and-merge steps used to produce)
order mdate region_id region_air_complaint diff_region_air_complaint
sort region_id mdate

// Inspect the distributions. spbalance without options only reports whether
// the panel is strongly balanced; it does not modify the data.
summarize region_air_complaint diff_region_air_complaint, detail
spbalance

// Save final region air complaints dataset
save "$processed_data/region air complaints.dta", replace